
## Generate gender variables for attorneys
## Zihao Li

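# Gender variables are generated for every specification listed in `columns_to_aggregate`
# (suffixes 09_100, 09_50, 08_100, 08_50 -- presumably prob=0.9/0.8 and count=100/50; confirm).
# For other specifications, change varnames accordingly.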
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
import re

# Base directory of the PatentsView data; main() appends 'temp/...' (input)
# and 'cleandata/...' (output) to it, so the trailing slash is required.
# NOTE(review): this name shadows the builtin dir() for the whole module.
dir = r'/Volumes/Zihao_SSD2/PatentsView/'

def extract_suffix(column_name):
    """Return the trailing specification tag of a column name.

    The tag is two digits, an underscore, then two or three digits, anchored
    at the end of the name (e.g. ``'a_gender_09_100'`` -> ``'09_100'``).

    Args:
        column_name: Column name to inspect.

    Returns:
        The matched tag as a string, or ``None`` when the name does not end
        with such a tag.
    """
    found = re.search(r'\d{2}_\d{2,3}$', column_name)
    if found is None:
        return None
    return found.group(0)

# Indicator columns derived for each specification, in output-column order.
# The position of each label matches the tuple returned by _gender_flags().
_INDICATOR_LABELS = ('existfemale', 'allfemale', 'allmale', 'mixed', 'undetermined')

# Labels written to a_genderdiscrete_*. They are applied in this order, so a
# later label overwrites an earlier one when a row carries more than one flag.
_DISCRETE_LABELS = ('allfemale', 'allmale', 'mixed', 'undetermined')


def _gender_flags(genders):
    """Classify one patent's list of attorney gender labels.

    Args:
        genders: List of per-attorney labels; the strings 'male', 'female'
            and 'ambiguous' are the only values that are tested for.

    Returns:
        A tuple of 0/1 ints ordered like ``_INDICATOR_LABELS``:
        (existfemale, allfemale, allmale, mixed, undetermined).
    """
    has_female = 'female' in genders
    has_male = 'male' in genders
    has_ambiguous = 'ambiguous' in genders

    # NOTE(review): a list containing none of the three labels (e.g. only
    # empty/missing values) is flagged both all-female and all-male; main()
    # then labels it 'allmale' because that label is applied after
    # 'allfemale'. Confirm whether such rows can occur in the input.
    all_female = not has_male and not has_ambiguous
    all_male = not has_female and not has_ambiguous
    mixed = has_male and has_female
    undetermined = not (all_female or all_male or mixed)

    return (int(has_female), int(all_female), int(all_male), int(mixed), int(undetermined))


def main():
    """Build patent-level attorney gender variables and export them as CSV.

    Steps:
      1. Read ``temp/g_attorney_gender_temp.dta`` under the module-level
         base directory.
      2. Group by ``patent_id_i`` and collect each ``a_gender_*`` column into
         a list per patent.
      3. For every specification suffix, add 0/1 int indicator columns
         (``a_existfemale_*``, ``a_allfemale_*``, ``a_allmale_*``,
         ``a_mixed_*``, ``a_undetermined_*``) and a categorical
         ``a_genderdiscrete_*`` column.
      4. Drop patent ids containing ASCII letters and convert the rest to
         numeric.
      5. Rename the list columns to ``*_list`` and write
         ``cleandata/g_attorney_clean.csv``.

    Raises:
        ValueError: If a remaining ``patent_id_i`` cannot be parsed as a
            number (``pd.to_numeric`` is called with ``errors="raise"``).
    """
    ## Load attorney gender data: g_attorney_gender_temp.dta
    print('Loading gender data...')
    df_orig = pd.read_stata(dir + 'temp/g_attorney_gender_temp.dta')
    print('Shape of original dataset is {}.'.format(df_orig.shape)) # (3345804, 15)

    columns_to_aggregate = ['a_gender_09_100', 'a_gender_09_50', 'a_gender_08_100', 'a_gender_08_50']
    # One list of attorney-level labels per patent and specification.
    agg_dict = {col: lambda x: x.tolist() for col in columns_to_aggregate}
    df = (
        df_orig[['patent_id_i'] + columns_to_aggregate]
        .groupby('patent_id_i')
        .agg(agg_dict)
        .reset_index()
        .astype({'patent_id_i': str})
        .sort_values(by=['patent_id_i'])
    )
    print('Shape of groupby dataset (grouped by patent_id) is {}.'.format(df.shape)) # (2407270, 5)

    ## Generate gender variables
    print('Generating gender variables...')
    suffixes = [extract_suffix(col) for col in columns_to_aggregate]

    # Indicator variables
    for suffix in suffixes:
        col_name = f'a_gender_{suffix}'
        print(f"Column: {col_name}, Suffix: {suffix}")
        # Classify every patent once, then fan the tuple out into columns;
        # this avoids repeated row-wise DataFrame.apply passes.
        flags = [_gender_flags(genders) for genders in df[col_name]]
        for position, label in enumerate(_INDICATOR_LABELS):
            df[f'a_{label}_{suffix}'] = [row[position] for row in flags]

    # Categorical genderdiscrete variable
    # Kept as a second pass so the genderdiscrete columns stay grouped after
    # all indicator columns in the exported file.
    print('Gender discrete...')
    for suffix in suffixes:
        col_name = f'a_gender_{suffix}'
        print(f"Column: {col_name}, Suffix: {suffix}")
        for label in _DISCRETE_LABELS:
            df.loc[df[f'a_{label}_{suffix}'] == 1, f'a_genderdiscrete_{suffix}'] = label

    # Drop patent_id with letters, and convert patent_id to numeric
    print('Dropping non-numeric patent_id and converting to numeric...')
    has_letters = df["patent_id_i"].str.contains("[a-zA-Z]")
    # Explicit copy: the filtered frame is modified below, so it must not be
    # a slice of the unfiltered one.
    df = df[~has_letters].copy()
    df["patent_id_i"] = pd.to_numeric(df["patent_id_i"], errors="raise")
    print('Shape of final dataset is {}.'.format(df.shape)) # (2153586, 29)

    # The raw label lists are exported too, under a *_list name.
    rename_mapping = {col: f"{col}_list" for col in columns_to_aggregate}
    df = df.rename(columns=rename_mapping)

    print('Exporting the final dataset...')
    df.to_csv(dir + 'cleandata/g_attorney_clean.csv', index=False)


# Run the pipeline only when this file is executed as a script, not when it is imported.
if __name__ == '__main__':
    main()